Packages

library(dplyr)
library(dygraphs)
library(forecast)
library(h2o)
library(imputeTS)
library(lubridate)
library(plotly)
library(readxl)
library(TSstudio)

Univariate time series data for U.S. chicken prices

Data set: U.S. average price of fresh, whole chicken in dollars per pound Period: January 1, 1980, to March 1, 2022 Seasonally adjusted: No Source: Economic Research data from the Federal Reserve Bank of St. Louis (FRED) Series: APU0000706111 Web address: https://fred.stlouisfed.org/series/APU0000706111

## Obtain data
# Set the working directory to the location of the downloaded data files.
# NOTE(review): a hard-coded absolute path ties this script to one machine;
# a project-relative path would be more portable.
path <-
  "C:/Users/keoka/OneDrive - Texas A&M University/Courses/STAT_684/Project/Data"
setwd(path)
# Read the FRED spreadsheet; `<-` used for assignment, consistent with the
# rest of the script (original used `=`).
ch_df <- read_excel("APU0000706111.xls") # data series for chicken prices
## New names:
## * `` -> ...2
## Examine data
class(ch_df) # tibble ("tbl_df", "tbl") that also inherits from data.frame
## [1] "tbl_df"     "tbl"        "data.frame"
dim(ch_df) # 517 rows, 2 columns (includes 10 header/description rows)
## [1] 517   2
str(ch_df); summary(ch_df) # both columns read in as character vectors
## tibble [517 x 2] (S3: tbl_df/tbl/data.frame)
##  $ FRED Graph Observations: chr [1:517] "Federal Reserve Economic Data" "Link: https://fred.stlouisfed.org" "Help: https://fredhelp.stlouisfed.org" "Economic Research Division" ...
##  $ ...2                   : chr [1:517] NA NA NA NA ...
##  FRED Graph Observations     ...2          
##  Length:517              Length:517        
##  Class :character        Class :character  
##  Mode  :character        Mode  :character
head(ch_df,10+5); tail(ch_df,5) # view first 15 rows (10 header rows + first 5 data rows) and last 5 rows
## # A tibble: 15 x 2
##    `FRED Graph Observations`             ...2                                   
##    <chr>                                 <chr>                                  
##  1 Federal Reserve Economic Data         <NA>                                   
##  2 Link: https://fred.stlouisfed.org     <NA>                                   
##  3 Help: https://fredhelp.stlouisfed.org <NA>                                   
##  4 Economic Research Division            <NA>                                   
##  5 Federal Reserve Bank of St. Louis     <NA>                                   
##  6 <NA>                                  <NA>                                   
##  7 APU0000706111                         Average Price: Chicken, Fresh, Whole (~
##  8 <NA>                                  <NA>                                   
##  9 Frequency: Monthly                    <NA>                                   
## 10 observation_date                      APU0000706111                          
## 11 29221                                 0.69899999999999995                    
## 12 29252                                 0.67300000000000004                    
## 13 29281                                 0.65500000000000003                    
## 14 29312                                 0.63800000000000001                    
## 15 29342                                 0.628
## # A tibble: 5 x 2
##   `FRED Graph Observations` ...2              
##   <chr>                     <chr>             
## 1 44501                     1.583             
## 2 44531                     1.6060000000000001
## 3 44562                     1.6220000000000001
## 4 44593                     1.6319999999999999
## 5 44621                     1.724
names(ch_df) # default column names taken from the spreadsheet header
## [1] "FRED Graph Observations" "...2"
## Reformat data
# Keep only the observation rows; rows 1-10 are the FRED file description.
ch_df <- ch_df[11:517, ]
# Give the two columns descriptive names.
names(ch_df) <- c("date", "chicken price")
# Dates arrive as Excel serial day numbers stored as character strings;
# convert via numeric using the Excel epoch (1899-12-30).
ch_df$date <- as.Date(as.numeric(ch_df$date),
                      origin = as.Date("1899-12-30"))
# Prices also arrive as character strings; coerce to numeric and keep
# three decimal places.
ch_df$`chicken price` <- round(as.numeric(ch_df$`chicken price`), 3)
str(ch_df); summary(ch_df) # tibble with date variable and numeric variable
## tibble [507 x 2] (S3: tbl_df/tbl/data.frame)
##  $ date         : Date[1:507], format: "1980-01-01" "1980-02-01" ...
##  $ chicken price: num [1:507] 0.699 0.673 0.655 0.638 0.628 0.64 0.71 0.751 0.791 0.792 ...
##       date            chicken price  
##  Min.   :1980-01-01   Min.   :0.000  
##  1st Qu.:1990-07-16   1st Qu.:0.884  
##  Median :2001-02-01   Median :1.048  
##  Mean   :2001-01-30   Mean   :1.092  
##  3rd Qu.:2011-08-16   3rd Qu.:1.304  
##  Max.   :2022-03-01   Max.   :1.747
# Check the price column for missing data
which(is.na(ch_df[,2])) # no NA values present
## integer(0)
which(ch_df[,2]==0) # missing value was converted to zero value in row 485
## [1] 485
ch_df[481:490,] # view zero value in row 485 (May 2020)
## # A tibble: 10 x 2
##    date       `chicken price`
##    <date>               <dbl>
##  1 2020-01-01            1.41
##  2 2020-02-01            1.36
##  3 2020-03-01            1.4 
##  4 2020-04-01            1.57
##  5 2020-05-01            0   
##  6 2020-06-01            1.75
##  7 2020-07-01            1.71
##  8 2020-08-01            1.61
##  9 2020-09-01            1.54
## 10 2020-10-01            1.58
ch_df[485,2] <- NA # convert zero value back to missing (NA) value (May 2020)
ch_df[481:490,] # view missing value (NA) in row 485
## # A tibble: 10 x 2
##    date       `chicken price`
##    <date>               <dbl>
##  1 2020-01-01            1.41
##  2 2020-02-01            1.36
##  3 2020-03-01            1.4 
##  4 2020-04-01            1.57
##  5 2020-05-01           NA   
##  6 2020-06-01            1.75
##  7 2020-07-01            1.71
##  8 2020-08-01            1.61
##  9 2020-09-01            1.54
## 10 2020-10-01            1.58
# Impute the missing May 2020 value using linear interpolation.
# Interpolate only the price column: wrapping the whole data frame in ts()
# needlessly coerces the Date column to numeric and relies on positional
# matrix indexing. Result is identical (midpoint of the two neighbors).
ch_df[485,2] <- na_interpolation(ts(ch_df[[2]]))[485]
ch_df[481:490,] # view imputed value in row 485
## # A tibble: 10 x 2
##    date       `chicken price`
##    <date>               <dbl>
##  1 2020-01-01            1.41
##  2 2020-02-01            1.36
##  3 2020-03-01            1.4 
##  4 2020-04-01            1.57
##  5 2020-05-01            1.66
##  6 2020-06-01            1.75
##  7 2020-07-01            1.71
##  8 2020-08-01            1.61
##  9 2020-09-01            1.54
## 10 2020-10-01            1.58
## Convert data frame to univariate time series object with defined attributes
ch_ts <- ts(data = ch_df[,2], # series values
         start = c(1980,1), # time of first observation: January 1, 1980
         end = c(2022,3), # time of last observation: March 1, 2022
         frequency = 12) # monthly data: 12 observations per year
ts_info(ch_ts) # ts object with 1 variable and 507 observations
##  The ch_ts series is a ts object with 1 variable and 507 observations
##  Frequency: 12 
##  Start time: 1980 1 
##  End time: 2022 3

This univariate time series data set contains the average monthly values for the price of fresh, whole chicken in the United States, measured in dollars per pound, from January 1st, 1980, to March 1st, 2022, for a total of 507 months. The missing value for May 1st, 2020, was imputed using linear interpolation.

Exploratory Data Analysis

Plot of observed values

## Base-graphics view of the series
# plot() dispatches to plot.ts() for a "ts" object, so calling the generic
# is equivalent to the explicit plot.ts() call.
plot(ch_ts,
     xlab = "Years",
     ylab = "Cost per Pound",
     main = "U.S. Average Price of Whole Chicken")

## Interactive views of the series: TSstudio::ts_plot and dygraphs::dygraph
# Shared labels, defined once so both charts stay in sync.
chart_main <- "U.S. Average Price of Whole Chicken"
chart_y <- "Cost per Pound"
chart_x <- "Years"
# TSstudio plot with both axis grids enabled
ts_plot(ch_ts,
        title = chart_main,
        Ytitle = chart_y,
        Xtitle = chart_x,
        Xgrid = TRUE,
        Ygrid = TRUE)
## Plot time series with "dygraph" function from "dygraphs" package
dygraph(ch_ts, main = chart_main, ylab = chart_y, xlab = chart_x)

Decomposition of time series data

## Classical decomposition of time series
ch_dc <- decompose(ch_ts) # additive decomposition from built-in "stats" package
ts_decompose(ch_ts) # interactive version of the same plot from "TSstudio"
class(ch_dc); summary(ch_dc) # review attributes of decomposed time series
## [1] "decomposed.ts"
##          Length Class  Mode     
## x        507    ts     numeric  
## seasonal 507    ts     numeric  
## trend    507    ts     numeric  
## random   507    ts     numeric  
## figure    12    -none- numeric  
## type       1    -none- character
plot(ch_dc) # plot observed, trend, seasonal, and random components

The time series has a growing trend with an embedded cycle, which are both apparent in the observed series. The most recent cycle started just before 2010, near the end of the Great Recession that began in 2008. The seasonal component is not apparent in the observed series. The impact of the COVID-19 pandemic from 2020 to 2022 is conspicuous in both the observed series and the random component. The time series plot can be decomposed to show the trend (including cycle), seasonal, and random components.

Seasonality analysis

## Time series heatmap
ts_heatmap(ch_ts, title = "Heatmap - US Average Price of Whole Chicken")
## Seasonality plots
ggseasonplot(ch_ts) # seasonal plot

ggseasonplot(ch_ts,polar = TRUE) # polar plot

ts_seasonal(ch_ts,type = "normal") # one line per year across the months
ts_seasonal(ch_ts,type = "cycle") # each month tracked across the years
ts_seasonal(ch_ts,type = "box") # box plot by month
ts_seasonal(ch_ts,type = "all") # all three views combined
## Box plot of the seasonal component of the detrended series
ch_dt <- ch_ts - decompose(ch_ts)$trend # remove trend (NAs at series ends)
ts_seasonal(ch_dt,type = "box") # box plot
## Warning: Ignoring 2 observations

## Warning: Ignoring 2 observations

## Warning: Ignoring 2 observations
## Warning: Ignoring 1 observations

## Warning: Ignoring 1 observations

## Warning: Ignoring 1 observations

## Warning: Ignoring 1 observations

## Warning: Ignoring 1 observations

## Warning: Ignoring 1 observations

The heatmap shows evidence of cyclic behavior (across the vertical bars), but not seasonal behavior (across the horizontal bars). All four seasonality plots lack evidence for seasonal behavior in the time series based on the following behavior:

  • Horizontal lines in the standard plot
  • Rope appearance in the cycle plot
  • Horizontal pattern across the box plots
  • Circular spiral pattern in the polar plot

Correlation analysis

par(mfrow=c(1,2)) # display two plots in one row
# NOTE(review): this par() change persists for later base-graphics plots
acf(ch_ts, lag.max = 60) # autocorrelation function (5 years of lags)
pacf(ch_ts, lag.max = 60) # partial autocorrelation function

ts_cor(ch_ts) # acf and pacf
ts_lags(ch_ts) # lag plots
ts_lags(ch_ts,lags = c(12,24,36,48)) # seasonal lags
checkresiduals(ch_ts) # residual analysis
## Warning in modeldf.default(object): Could not find appropriate degrees of
## freedom for this model.

Box.test(ch_ts,type = "Ljung") # Ljung-Box test of autocorrelation (default lag = 1)
## 
##  Box-Ljung test
## 
## data:  ch_ts
## X-squared = 497.53, df = 1, p-value < 2.2e-16

The correlation of the series with its lags is decaying gradually over time, with no apparent seasonal component.

The lack of seasonality makes sense given that chicken is a food eaten year-round in the United States.

Forecasting Strategies

Forecasting with Linear Regression

## Linear regression forecasting model
h <- 12 # forecast horizon = last 12 observations
ch_split <- ts_split(ch_ts,sample.out = h) # partition of data set
ch_train <- ch_split$train # training set: first 495 observations
ch_test <- ch_split$test # test set: last 12 observations
ch_lr1 <- tslm(ch_train ~ season + trend + I(trend^2)) # seasonal dummies + quadratic trend
summary(ch_lr1)
## 
## Call:
## tslm(formula = ch_train ~ season + trend + I(trend^2))
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.206090 -0.040381 -0.008574  0.040342  0.178937 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.176e-01  1.313e-02  54.637   <2e-16 ***
## season2     -8.652e-04  1.438e-02  -0.060    0.952    
## season3      1.861e-03  1.438e-02   0.129    0.897    
## season4      5.898e-03  1.447e-02   0.408    0.684    
## season5      3.263e-03  1.447e-02   0.225    0.822    
## season6      1.240e-02  1.447e-02   0.857    0.392    
## season7      1.974e-02  1.447e-02   1.364    0.173    
## season8      1.528e-02  1.447e-02   1.056    0.291    
## season9      1.573e-02  1.447e-02   1.087    0.278    
## season10     1.081e-02  1.447e-02   0.747    0.456    
## season11     1.073e-02  1.447e-02   0.742    0.459    
## season12     3.848e-03  1.447e-02   0.266    0.790    
## trend        7.478e-04  8.310e-05   8.998   <2e-16 ***
## I(trend^2)   2.113e-06  1.623e-07  13.024   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06591 on 481 degrees of freedom
## Multiple R-squared:  0.941,  Adjusted R-squared:  0.9395 
## F-statistic: 590.6 on 13 and 481 DF,  p-value: < 2.2e-16
## Forecast the next 12 months (testing set)
fc_lr1 <- forecast(ch_lr1,h=12) # point forecasts + prediction intervals
plot_forecast(fc_lr1)
## Evaluate model performance on training and test sets
accuracy(fc_lr1,ch_test)
##                         ME       RMSE        MAE        MPE     MAPE      MASE
## Training set -3.802647e-18 0.06496906 0.05047630 -0.3418192 4.699188 0.9732557
## Test set     -8.431297e-02 0.11498445 0.09802013 -5.7189811 6.514060 1.8899690
##                   ACF1 Theil's U
## Training set 0.9144903        NA
## Test set     0.6838286  3.001855
# Overlay fitted and forecasted values against the actual series
test_forecast(actual = ch_ts,
              forecast.obj = fc_lr1,
              test = ch_test)
## Analyze model residuals
checkresiduals(fc_lr1)

## 
##  Ljung-Box test
## 
## data:  Residuals from Linear regression model
## Q* = 3256.4, df = 10, p-value < 2.2e-16
## 
## Model df: 14.   Total lags used: 24

The model coefficients for the intercept and the trend components are statistically significant at a level of less than 0.001. None of the coefficients for the seasonal components are statistically significant.

The MAPE is 4.70% on the training set and 6.51% on the test set.

The adjusted R-squared has a value of 0.94, which suggests that most of the variation is explained by the model. However, analysis of residuals shows significant correlation in the model between the series and its lags. This is confirmed by the statistically-significant p-value of the Ljung-Box test, rejecting the null hypothesis that the random component is white noise. This means that the model does not capture a majority of the variation patterns of the series. Therefore, it is not a valid model for consideration. However, we will use its MAPE score of 6.51% as a benchmark to evaluate the performance of the other models that we will train.

Forecasting with Exponential Smoothing Models

Holt-Winters model

The time series data set was partitioned into a training set consisting of the values of the first 495 months, and a test set consisting of the last 12 months.

## Create training and testing partitions
ch_split <- ts_split(ch_ts,12) # hold out the last 12 months for testing
ch_train <- ch_split$train # first 495 observations
ch_test <- ch_split$test # last 12 observations

## Forecast the last 12 months of the series (testing set)
ch_hw1 <- HoltWinters(ch_train) # parameters estimated by minimizing squared error
ch_hw1 # review the parameters and error rate of the trained model
## Holt-Winters exponential smoothing with trend and additive seasonal component.
## 
## Call:
## HoltWinters(x = ch_train)
## 
## Smoothing parameters:
##  alpha: 0.8483446
##  beta : 0
##  gamma: 1
## 
## Coefficients:
##             [,1]
## a    1.552731393
## b    0.002661131
## s1   0.025892031
## s2   0.022691819
## s3   0.022586677
## s4  -0.011744177
## s5  -0.037994348
## s6  -0.033122943
## s7  -0.002522652
## s8  -0.006906191
## s9  -0.013412400
## s10 -0.019775039
## s11 -0.016828074
## s12 -0.009731393
## Forecast the next 12 months (testing set)
fc_hw1 <- forecast(ch_hw1,h=12) # point forecasts + prediction intervals
plot_forecast(fc_hw1)
## Evaluate model performance on training and test sets
accuracy(fc_hw1,ch_test)
##                        ME       RMSE        MAE        MPE     MAPE      MASE
## Training set -0.001103989 0.02898050 0.02045626 -0.1396244 1.876760 0.3944261
## Test set     -0.015289850 0.08053015 0.07144645 -1.2463203 4.622068 1.3775903
##                   ACF1 Theil's U
## Training set 0.1173547        NA
## Test set     0.6876416  2.028356
# Overlay fitted and forecasted values against the actual series
test_forecast(actual = ch_ts,
              forecast.obj = fc_hw1,
              test = ch_test)
## Analyze model residuals
checkresiduals(fc_hw1)
## Warning in modeldf.default(object): Could not find appropriate degrees of
## freedom for this model.

The Holt-Winters model is mainly learning from the level and seasonal update (with \(\alpha=0.85\) and \(\gamma=1\)). On the other hand, there is no learning from the trend value (\(\beta=0\)). This makes sense given the global pandemic and overlapping period of acute inflation occurring from early 2020 to the present, which have no precedent since the beginning of the training set in 1980. This can be seen in the plot of model performance, which underestimates the peaks from 2020 to 2022.

The MAPE score is 1.88% in the training set and 4.62% in the testing set. However, residual analysis shows significant autocorrelation in the series with its lags, so we conclude that this is not a valid forecasting model.

We will next train the Holt-Winters model using a grid search. We will start with a shallow search with larger increments for the tuning parameters. This will narrow the search area for a deeper search of the tuning parameters. The shallow search will consist of backtesting on the training data set, with an expanding window of six different periods spaced six months apart. Each of the three tuning parameters will be initialized to a range of 0 to 1, with an increment of 0.1.

# Shallow grid search: backtest HW parameters on 6 expanding windows.
# ts_grid replaces any 0 in a parameter grid with 1e-5 (see warnings below).
shallow_grid <- ts_grid(ch_train,
                        model = "HoltWinters",
                        periods = 6, # number of backtesting periods
                        window_length = NULL, # expanding window
                        window_space = 6, # length between training partitions
                        window_test = 12, # length of testing partition
                        # tuning parameters: 0 to 1 in coarse 0.1 steps
                        hyper_params = list(alpha = seq(0,1,0.1),
                                            beta = seq(0,1,0.1),
                                            gamma = seq(0,1,0.1)))
## Warning in ts_grid(ch_train, model = "HoltWinters", periods = 6, window_length =
## NULL, : The value of the 'alpha' parameter cannot be equal to 0 replacing 0 with
## 1e-5
## Warning in ts_grid(ch_train, model = "HoltWinters", periods = 6, window_length =
## NULL, : The value of the 'beta' parameter cannot be equal to 0 replacing 0 with
## 1e-5
## Warning in ts_grid(ch_train, model = "HoltWinters", periods = 6, window_length =
## NULL, : The value of the 'gamma' parameter cannot be equal to 0 replacing 0 with
## 1e-5
## Warning: Strategy 'multiprocess' is deprecated in future (>= 1.20.0). Instead,
## explicitly specify either 'multisession' or 'multicore'. In the current R
## session, 'multiprocess' equals 'multisession'.
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo 
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo 
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
shallow_grid$grid_df[1:10,] # top 10 parameter combinations, sorted by lowest mean error rate
##    alpha beta gamma        1        2        3        4        5        6
## 1  1e-05  1.0   0.8 1.767340 2.058503 3.126945 4.972858 6.063063 6.196010
## 2  1e-05  0.9   0.8 1.770942 2.062246 3.129872 4.976019 6.062456 6.190128
## 3  1e-05  0.8   0.8 1.774574 2.066020 3.132823 4.979205 6.061845 6.184201
## 4  1e-05  0.7   0.8 1.778235 2.069825 3.135798 4.982416 6.061229 6.178229
## 5  1e-05  0.6   0.8 1.781926 2.073660 3.138797 4.985653 6.060608 6.172211
## 6  1e-05  0.5   0.8 1.785648 2.077526 3.141821 4.988916 6.059982 6.166146
## 7  1e-05  0.4   0.8 1.789399 2.081423 3.144869 4.992205 6.059351 6.160035
## 8  1e-05  0.3   0.8 1.793181 2.085351 3.147942 4.995520 6.058715 6.153877
## 9  1e-05  0.2   0.8 1.796994 2.089311 3.151039 4.998862 6.058075 6.147671
## 10 1e-05  0.1   0.8 1.800838 2.093303 3.154162 5.002230 6.057429 6.141418
##        mean
## 1  4.030786
## 2  4.031944
## 3  4.033111
## 4  4.034289
## 5  4.035476
## 6  4.036673
## 7  4.037880
## 8  4.039098
## 9  4.040325
## 10 4.041563

The table displays the results of the shallow grid search. The models are sorted from best to worst, according to the combination of tuning parameters having the lowest mean error rates. Among the top models, \(\alpha\) is pinned at its lower bound (1e-5, effectively zero), \(\beta\) varies across 0.1 to 1.0 with little effect on the error, and \(\gamma\) is constant at 0.8. This will help us narrow the ranges of parameter values for a deeper grid search.

# Deep grid search: narrow ranges around the shallow-search optimum
# (alpha near 0, beta across 0.1-1.0, gamma near 0.8) with finer steps.
deep_grid <- ts_grid(ch_train,
                     model = "HoltWinters",
                     periods = 6, # number of backtesting periods
                     window_length = NULL, # expanding window
                     window_space = 6, # length between training partitions
                     window_test = 12, # length of testing partition
                     # tuning parameters
                     hyper_params = list(alpha = seq(0,0.1,0.01),
                                         beta = seq(0.1,1.0,0.01),
                                         gamma = seq(0.75,0.85,0.01)))
## Warning in ts_grid(ch_train, model = "HoltWinters", periods = 6, window_length =
## NULL, : The value of the 'alpha' parameter cannot be equal to 0 replacing 0 with
## 1e-5
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo 
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo 
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
deep_grid$grid_df[1:10,] # top 10 parameter combinations, sorted from lowest to highest mean error rate
##    alpha beta gamma        1        2        3        4        5        6
## 1   0.01 0.10  0.75 1.481318 1.665221 2.710497 4.447313 6.157248 7.465748
## 2   0.01 0.12  0.75 1.486573 1.631147 2.623509 4.348712 6.177763 7.661964
## 3   0.01 0.11  0.75 1.484054 1.647867 2.669365 4.399760 6.167436 7.562749
## 4   0.01 0.13  0.75 1.494189 1.626541 2.574063 4.295410 6.187974 7.761380
## 5   0.01 0.10  0.76 1.480825 1.667580 2.710790 4.444822 6.155434 7.483908
## 6   0.01 0.11  0.76 1.483358 1.650934 2.671270 4.399020 6.165293 7.578203
## 7   0.01 0.12  0.76 1.485677 1.636831 2.627160 4.349802 6.175304 7.674737
## 8   0.01 0.14  0.75 1.514646 1.622276 2.522091 4.240978 6.197852 7.859271
## 9   0.01 0.10  0.77 1.480292 1.669995 2.711613 4.442515 6.153814 7.502181
## 10  0.01 0.13  0.76 1.495063 1.632556 2.579553 4.298360 6.185221 7.771572
##        mean
## 1  3.987891
## 2  3.988278
## 3  3.988538
## 4  3.989926
## 5  3.990560
## 6  3.991346
## 7  3.991585
## 8  3.992853
## 9  3.993402
## 10 3.993721

The error range of the top 10 models has dropped slightly compared to the shallow search. The next step is to retrain the HW model using the optimal values of the smoothing parameters from the deep grid search.

## Retrain HW model using optimal parameter values from deep grid search
# deep_grid$alpha/beta/gamma hold the best combination found (0.01, 0.1, 0.75)
ch_hw2 <- HoltWinters(ch_train, # training set
                          # tuning parameters from deep grid search
                          alpha = deep_grid$alpha,
                          beta = deep_grid$beta,
                          gamma =  deep_grid$gamma)
ch_hw2 # review the parameters and error rate of the retrained model
## Holt-Winters exponential smoothing with trend and additive seasonal component.
## 
## Call:
## HoltWinters(x = ch_train, alpha = deep_grid$alpha, beta = deep_grid$beta,     gamma = deep_grid$gamma)
## 
## Smoothing parameters:
##  alpha: 0.01
##  beta : 0.1
##  gamma: 0.75
## 
## Coefficients:
##             [,1]
## a    1.617683445
## b    0.002471430
## s1  -0.034052087
## s2   0.033793515
## s3   0.117248756
## s4   0.083424469
## s5  -0.005542369
## s6  -0.063785998
## s7  -0.029701178
## s8  -0.006291656
## s9  -0.020333580
## s10 -0.049664924
## s11 -0.069951829
## s12 -0.095995759
## Forecast the next 12 months (testing set)
fc_hw2 <- forecast(ch_hw2,h=12) # point forecasts + prediction intervals
plot_forecast(fc_hw2)
## Evaluate the model's performance with testing set
accuracy(fc_hw2,ch_test)
##                         ME      RMSE        MAE        MPE     MAPE      MASE
## Training set -0.0003927547 0.0621628 0.04758654 -0.3364691 4.513327 0.9175369
## Test set     -0.0740100206 0.1451797 0.11717752 -5.1865096 7.739326 2.2593509
##                   ACF1 Theil's U
## Training set 0.8607513        NA
## Test set     0.6742600   3.83454
# Overlay fitted and forecasted values against the actual series
test_forecast(actual = ch_ts,
              forecast.obj = fc_hw2,
              test = ch_test)
## Analyze model residuals
checkresiduals(fc_hw2)
## Warning in modeldf.default(object): Could not find appropriate degrees of
## freedom for this model.

As you can see from the plot of fitted and forecasted values, the HW model obtained from the grid search underestimated the peaks from 2020 to 2022, with an MAPE score of 7.74% on the test set. Correlation analysis suggests that the random component is not white noise, meaning that this is not a valid model for consideration.

Forecasting with ARIMA Models

Transforming a non-stationary series into a stationary series

## Compare candidate stationarity transformations of the series
# Raw series, for reference
ts_plot(ch_ts,
        Xtitle = "Year",
        Ytitle = "Cost per Pound",
        title = "Monthly Chicken Prices 1980-2022")
# Log transformation stabilizes the variance
ts_plot(log(ch_ts),
        Xtitle = "Year",
        Ytitle = "Log of Cost per Pound",
        title = "Chicken Price Series - Log Transformation")
# First-order differencing removes the trend
ch_d1 <- diff(ch_ts, lag = 1)
ts_plot(ch_d1,
        Xtitle = "Year",
        Ytitle = "Differencing of Cost per Pound",
        title = "Chicken Price Series - First Differencing")
# First-order plus seasonal (lag-12) differencing
ts_plot(diff(ch_d1, 12),
        Xtitle = "Year",
        Ytitle = "Differencing of Cost per Pound",
        title = "Chicken Price Series - First and Seasonal Differencing")
# Log transformation followed by first-order differencing
ch_ld1 <- diff(log(ch_ts), lag = 1)
ts_plot(ch_ld1,
        Xtitle = "Year",
        Ytitle = "Differencing/Log of Cost per Pound",
        title = "Chicken Price Series - First Differencing with Log Transformation")
# Log transformation with first-order and seasonal differencing
ts_plot(diff(ch_ld1, 12),
        Xtitle = "Year",
        Ytitle = "Differencing/Log of Cost per Pound",
        title =
          "Chicken Price Series - First and Seasonal Differencing with Log Transformation")

The log transformation with first-order differencing appears to do the best job of transforming the series to a stationary state and stabilizing the series variation. We will next use this sequence of transformations to manually fit an ARIMA model.

Fitting an ARIMA model with a manual process

Autoregressive Integrated Moving Average (ARIMA)

par(mfrow=c(1,2)) # display two plots in one row
acf(diff(log(ch_ts)),lag.max = 60) # autocorrelation function (ACF)
pacf(diff(log(ch_ts)),lag.max = 60) # partial autocorrelation function (PACF)

ts_cor(diff(log(ch_ts))) # acf and pacf
## Train an ARIMA(0,1,0) model
# random walk on the log scale: no AR or MA terms, first difference only
arima_m1 <- arima(log(ch_ts),order = c(0,1,0))
summary(arima_m1)
## 
## Call:
## arima(x = log(ch_ts), order = c(0, 1, 0))
## 
## 
## sigma^2 estimated as 0.0005788:  log likelihood = 1168.04,  aic = -2334.08
## 
## Training set error measures:
##                       ME       RMSE        MAE       MPE     MAPE      MASE
## Training set 0.001779869 0.02403351 0.01771173 -7.707259 25.20289 0.9980674
##                    ACF1
## Training set 0.07686383
checkresiduals(arima_m1) # residual diagnostics with Ljung-Box test

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,0)
## Q* = 54.622, df = 24, p-value = 0.0003519
## 
## Model df: 0.   Total lags used: 24
# Forecast the next 12 months
# NOTE(review): arima_m1 was fit to log(ch_ts), so these forecasts are on
# the log scale; back-transform with exp() before reporting prices.
arima_m1_fc <- forecast(arima_m1,h=12)
plot_forecast(arima_m1_fc)

A log transformation was applied to the series, followed by first-order differencing. The transformed series appears to tail off in both the autocorrelation function (ACF) and the partial autocorrelation function (PACF). There is no apparent seasonality pattern. Therefore, an ARIMA(0,1,0) was trained, resulting in an MAPE score of 25.2%. Furthermore, residual analysis shows that the series has significant autocorrelation with its lags.

Fitting an ARIMA model with an automated tuning process

## Automated ARIMA model tuning using default arguments
# default stepwise search; selects ARIMA(2,1,2) with drift (output below)
arima_a1 <- auto.arima(ch_train) # tune model using training set
summary(arima_a1)
## Series: ch_train 
## ARIMA(2,1,2) with drift 
## 
## Coefficients:
##          ar1      ar2      ma1     ma2   drift
##       1.2360  -0.4146  -1.2571  0.3431  0.0018
## s.e.  0.2543   0.2306   0.2602  0.2550  0.0006
## 
## sigma^2 = 0.0007132:  log likelihood = 1091.14
## AIC=-2170.29   AICc=-2170.12   BIC=-2145.07
## 
## Training set error measures:
##                       ME       RMSE        MAE         MPE     MAPE      MASE
## Training set 4.38536e-07 0.02654339 0.01889391 -0.06125949 1.737348 0.3643017
##                      ACF1
## Training set -0.006286073
checkresiduals(arima_a1) # residual diagnostics with Ljung-Box test

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,1,2) with drift
## Q* = 26.429, df = 19, p-value = 0.1187
## 
## Model df: 5.   Total lags used: 24
arima_a1_fc <- forecast(arima_a1,h=12) # forecast next 12 months with default model
plot_forecast(arima_a1_fc)
## Automated ARIMA model tuning using robust search
arima_a2 <- auto.arima(ch_train, # training set
                       # limit combined order of model to five
                       max.order = 5, # p+q+P+Q <= 5
                       d=1, # non-seasonal differencing
                       D=0, # seasonal differencing
                       stepwise = FALSE, # search all possible combinations
                       approximation = FALSE) # for more accurate calculations
summary(arima_a2)
## Series: ch_train 
## ARIMA(0,1,5) with drift 
## 
## Coefficients:
##           ma1      ma2      ma3      ma4      ma5   drift
##       -0.0278  -0.0676  -0.0792  -0.1975  -0.1030  0.0018
## s.e.   0.0446   0.0452   0.0443   0.0469   0.0453  0.0006
## 
## sigma^2 = 0.0007072:  log likelihood = 1093.71
## AIC=-2173.42   AICc=-2173.19   BIC=-2144
## 
## Training set error measures:
##                         ME       RMSE        MAE         MPE   MAPE      MASE
## Training set -1.273792e-05 0.02640456 0.01876925 -0.06024449 1.7265 0.3618981
##                      ACF1
## Training set 7.304486e-05
checkresiduals(arima_a2) # residual diagnostics with Ljung-Box test

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,5) with drift
## Q* = 21.455, df = 18, p-value = 0.2571
## 
## Model df: 6.   Total lags used: 24
## Forecast the next 12 months (testing set)
arima_a2_fc <- forecast(arima_a2,h=12)
plot_forecast(arima_a2_fc)

With its default parameters, the automated tuning process fitted an ARIMA(2,1,2) model that includes a drift term. With a robust search, the automated tuning process fit an ARIMA(0,1,5) with drift. Although both models have very similar performance, the ARIMA(0,1,5) with drift scored lower on all the error metrics. It has an MAPE score of 1.73%, and residual analysis indicating that its random component is white noise.

Linear regression with ARIMA errors

We will next train a linear regression model having the following three predictors: trend, 12-month seasonal lag, and a categorical variable for month of the year. Additionally, the errors will be modeled using the ARIMA procedure.

## Prepare data and create new features for the series
# 12-month (seasonal) lag of the price; the first 12 rows become NA
ch_df$lag12 <- dplyr::lag(ch_df$`chicken price`,n=12) # seasonal lag
# seasonal component: unordered month-of-year factor
ch_df$month <- factor(month(ch_df$date,label=TRUE),ordered = FALSE)
# marginal change in series from moving in time by one month
ch_df$trend <- 1:nrow(ch_df)

## Split the series into training and testing partitions
train_df <- ch_df[1:(nrow(ch_df)-12),] # first 495 rows
test_df <- ch_df[(nrow(ch_df)-12+1):nrow(ch_df),] # last 12 rows

## Train linear regression model with ARIMA errors using xreg argument
arima_e1 <- auto.arima(ch_train, # training set
                       # expand month factor to dummy variables;
                       # drop first column (January is the reference level)
                       xreg = cbind(model.matrix(~month,train_df)[,-1],
                                    train_df$trend,
                                    train_df$lag12),
                       seasonal = TRUE, # search seasonal and non-seasonal models
                       stepwise = FALSE, # search all possible combinations
                       approximation = FALSE) # for more accurate calculations
summary(arima_e1)
## Series: ch_train 
## Regression with ARIMA(1,0,4) errors 
## 
## Coefficients:
##          ar1      ma1      ma2      ma3      ma4  intercept  monthFeb  monthMar
##       0.9667  -0.0346  -0.0595  -0.0539  -0.1499     0.6376   -0.0001    0.0032
## s.e.  0.0150   0.0490   0.0492   0.0479   0.0509     0.0569    0.0041    0.0056
##       monthApr  monthMay  monthJun  monthJul  monthAug  monthSep  monthOct
##         0.0064    0.0042    0.0134    0.0194    0.0139    0.0135    0.0086
## s.e.    0.0066    0.0072    0.0074    0.0075    0.0074    0.0073    0.0066
##       monthNov  monthDec                
##         0.0094    0.0028  0.0018  0.0031
## s.e.    0.0057    0.0041  0.0002  0.0518
## 
## sigma^2 = 0.0006834:  log likelihood = 1077.22
## AIC=-2114.44   AICc=-2112.62   BIC=-2030.84
## 
## Training set error measures:
##                         ME       RMSE        MAE         MPE     MAPE      MASE
## Training set -0.0003500908 0.02595211 0.01817991 -0.09624445 1.641124 0.3505347
##                    ACF1
## Training set 0.01026581
## Residual diagnostics: the Ljung-Box test below rejects the white-noise null
## (p ~ 0.0002), indicating leftover autocorrelation in the residuals
checkresiduals(arima_e1)

## 
##  Ljung-Box test
## 
## data:  Residuals from Regression with ARIMA(1,0,4) errors
## Q* = 23.797, df = 5, p-value = 0.0002375
## 
## Model df: 19.   Total lags used: 24
## Forecast the next 12 months (testing set)
# rebuild the regression inputs from the testing partition, dropping the first
# dummy-variable column (reference category), exactly as done for training
xreg_test <- cbind(model.matrix(~month, test_df)[, -1],
                   test_df$trend,
                   test_df$lag12)
arima_e1_fc <- forecast(arima_e1, xreg = xreg_test)
plot_forecast(arima_e1_fc)

The linear regression model with ARIMA errors has a MAPE score of 1.64%; however, the Ljung-Box test above (p-value of roughly 0.0002) rejects the white-noise hypothesis, indicating that the residuals still contain autocorrelation.

Forecasting with Machine Learning Models

We will next use several machine learning regression models. Given that the purpose is to obtain a short-term forecast of 12 months, using the entire series may add noise to the model from previous cycles. Therefore, we will instead subset the series to only include the most recent cycle, beginning in January 2010, following the end of the 2008 economic crisis.

## Subset time series object and transform to a data frame
# keep observations from January 2010 onward and convert the ts object to a
# two-column data frame (ds = date, y = value)
ch_df_st <- ts_to_prophet(window(ch_ts,start=c(2010,1)))

## Examine data subset
class(ch_df_st); dim(ch_df_st) # data frame with 147 rows, 2 columns
## [1] "data.frame"
## [1] 147   2
str(ch_df_st); summary(ch_df_st) # date variable and numeric variable
## 'data.frame':    147 obs. of  2 variables:
##  $ ds: Date, format: "2010-01-01" "2010-02-01" ...
##  $ y : num  1.26 1.26 1.23 1.23 1.26 ...
##        ds                   y        
##  Min.   :2010-01-01   Min.   :1.230  
##  1st Qu.:2013-01-16   1st Qu.:1.427  
##  Median :2016-02-01   Median :1.484  
##  Mean   :2016-01-31   Mean   :1.464  
##  3rd Qu.:2019-02-15   3rd Qu.:1.523  
##  Max.   :2022-03-01   Max.   :1.747
head(ch_df_st,5); tail(ch_df_st,5) # view first 5 and last 5 observations
##           ds     y
## 1 2010-01-01 1.265
## 2 2010-02-01 1.265
## 3 2010-03-01 1.231
## 4 2010-04-01 1.230
## 5 2010-05-01 1.259
##             ds     y
## 143 2021-11-01 1.583
## 144 2021-12-01 1.606
## 145 2022-01-01 1.622
## 146 2022-02-01 1.632
## 147 2022-03-01 1.724
names(ch_df_st) # variable names: "ds" and "y"
## [1] "ds" "y"
## Reformat data
# rename "ds" to "date" for clarity; the feature code below refers to `date`
names(ch_df_st) <- c("date","y") # rename variables
head(ch_df_st,5) # view first five observations
##         date     y
## 1 2010-01-01 1.265
## 2 2010-02-01 1.265
## 3 2010-03-01 1.231
## 4 2010-04-01 1.230
## 5 2010-05-01 1.259
## Plot the post-2010 subset of the series
ts_plot(ch_df_st, # data frame with date and y columns
        title = "US Average Chicken Price Since January 2010",
        Ytitle = "Cost per Pound",
        Xtitle = "Year")

Feature engineering

We will next create new features to be used as informative input for the model.

## Create new features and add to time series data frame
ch_df_st <- ch_df_st %>%
  mutate(
    # categorical variable for month of year to capture seasonality
    month = factor(month(date, label = TRUE), ordered = FALSE),
    # variable for seasonal lag of 12 months
    lag12 = lag(y, n = 12)
  ) %>%
  # drop the first 12 rows, where the seasonal lag is undefined
  filter(!is.na(lag12)) %>%
  mutate(
    trend = row_number(), # trend component
    trend_sqr = trend^2   # second-degree polynomial of trend
  )
str(ch_df_st); head(ch_df_st) # view structure of data frame with new features
## 'data.frame':    135 obs. of  6 variables:
##  $ date     : Date, format: "2011-01-01" "2011-02-01" ...
##  $ y        : num  1.24 1.27 1.27 1.26 1.3 ...
##  $ month    : Factor w/ 12 levels "Jan","Feb","Mar",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ lag12    : num  1.26 1.26 1.23 1.23 1.26 ...
##  $ trend    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ trend_sqr: num  1 4 9 16 25 36 49 64 81 100 ...
##         date     y month lag12 trend trend_sqr
## 1 2011-01-01 1.241   Jan 1.265     1         1
## 2 2011-02-01 1.266   Feb 1.265     2         4
## 3 2011-03-01 1.269   Mar 1.231     3         9
## 4 2011-04-01 1.261   Apr 1.230     4        16
## 5 2011-05-01 1.302   May 1.259     5        25
## 6 2011-06-01 1.305   Jun 1.239     6        36

Training, testing, and model evaluation

To allow for model comparison, follow the same procedure used for the previous models to create the training and testing partitions. It will also be necessary to create inputs for the forecast itself.

## Create training and testing partitions of the data subset
h <- 12 # forecast horizon = 12 months
n_st <- nrow(ch_df_st) # number of rows in the subset
train_df_st <- ch_df_st[seq_len(n_st - h), ] # exclude last 12 months
test_df_st <- ch_df_st[(n_st - h + 1):n_st, ] # last 12 months

## Create data frame with the dates of the following 12 months
fc_df_st <- data.frame(
  # monthly dates starting one month after the last observed date
  date = seq.Date(from = max(ch_df_st$date) + months(1), by = "month",
                  length.out = h),
  # continue the trend counter beyond the observed series
  trend = seq(from = max(ch_df_st$trend) + 1, by = 1, length.out = h))

## Build the rest of the features
fc_df_st$trend_sqr <- fc_df_st$trend^2 # second-degree polynomial of trend
# categorical variable for month of year to capture seasonality
fc_df_st$month <- factor(month(fc_df_st$date, label = TRUE), ordered = FALSE)
# last 12 observed values become the 12-month lags of the forecast period
fc_df_st$lag12 <- tail(ch_df_st$y, 12)
str(fc_df_st); head(fc_df_st) # view structure of data frame with new features
## 'data.frame':    12 obs. of  5 variables:
##  $ date     : Date, format: "2022-04-01" "2022-05-01" ...
##  $ trend    : num  136 137 138 139 140 141 142 143 144 145 ...
##  $ trend_sqr: num  18496 18769 19044 19321 19600 ...
##  $ month    : Factor w/ 12 levels "Jan","Feb","Mar",..: 4 5 6 7 8 9 10 11 12 1 ...
##  $ lag12    : num  1.51 1.49 1.47 1.44 1.47 ...
##         date trend trend_sqr month lag12
## 1 2022-04-01   136     18496   Apr 1.515
## 2 2022-05-01   137     18769   May 1.486
## 3 2022-06-01   138     19044   Jun 1.474
## 4 2022-07-01   139     19321   Jul 1.435
## 5 2022-08-01   140     19600   Aug 1.472
## 6 2022-09-01   141     19881   Sep 1.504

Model benchmark

We will use a linear regression model as a benchmark for the machine learning models.

## Train the linear regression model on the training partition
# benchmark model: same features (month, lag12, trend, trend^2) the machine
# learning models use, so their MAPE scores are directly comparable
ch_lr2 <- lm(y ~ month + lag12 + trend + trend_sqr, data = train_df_st)
summary(ch_lr2) # review the model details
## 
## Call:
## lm(formula = y ~ month + lag12 + trend + trend_sqr, data = train_df_st)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.158711 -0.043873 -0.004981  0.047165  0.137115 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.493e-01  1.546e-01   4.846 4.25e-06 ***
## monthFeb    -6.246e-04  2.858e-02  -0.022 0.982604    
## monthMar     6.693e-04  2.858e-02   0.023 0.981363    
## monthApr     1.387e-02  2.933e-02   0.473 0.637310    
## monthMay     2.201e-02  2.934e-02   0.750 0.454794    
## monthJun     3.567e-02  2.937e-02   1.215 0.227179    
## monthJul     3.045e-02  2.940e-02   1.036 0.302735    
## monthAug     1.598e-02  2.932e-02   0.545 0.586959    
## monthSep     8.487e-03  2.933e-02   0.289 0.772871    
## monthOct     1.854e-02  2.947e-02   0.629 0.530595    
## monthNov     1.856e-02  2.934e-02   0.633 0.528304    
## monthDec     1.369e-02  2.933e-02   0.467 0.641651    
## lag12        4.748e-01  1.262e-01   3.763 0.000273 ***
## trend       -3.076e-05  1.176e-03  -0.026 0.979172    
## trend_sqr    5.775e-06  8.065e-06   0.716 0.475526    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06702 on 108 degrees of freedom
## Multiple R-squared:  0.491,  Adjusted R-squared:  0.425 
## F-statistic:  7.44 on 14 and 108 DF,  p-value: 1.151e-10
## Predict the corresponding values of the series on the testing partition
test_df_st$yhat <- predict(ch_lr2, newdata = test_df_st)

## Evaluate the performance of the model on the testing partition
# mean absolute percentage error of the predictions
abs_pct_error <- abs(test_df_st$y - test_df_st$yhat) / test_df_st$y
mape_lr2 <- mean(abs_pct_error)
mape_lr2
## [1] 0.07164973
## Analyze model residuals
checkresiduals(ch_lr2)

## 
##  Breusch-Godfrey test for serial correlation of order up to 18
## 
## data:  Residuals
## LM test = 82.795, df = 18, p-value = 2.767e-10

The residual plot shows a nonrandom pattern. The correlation plot shows that the series is dependent on its lags. Therefore, it is not a valid forecasting model. However, we will use its MAPE score of approximately 7.16% as a benchmark for the performance of the machine learning models.

Starting an h2o cluster

## Set the in-memory cluster with the H2O function
# starts (or connects to) a local H2O cluster used by the ML models below
h2o.init() # provides information about the cluster's setup
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         2 days 19 hours 
##     H2O cluster timezone:       America/Los_Angeles 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.36.0.4 
##     H2O cluster version age:    1 month and 4 days  
##     H2O cluster name:           H2O_started_from_R_keoka_pdb659 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   0.55 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  4 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.1.2 (2021-11-01)
## Transform data frame objects to H2OFrame objects in the cluster
train_h <- as.h2o(train_df_st) # training set
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
test_h <- as.h2o(test_df_st) # testing set; model predictions are added later
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
# NOTE(review): forecast_h is not used in the visible code below — confirm it
# feeds a final forecast step elsewhere in the document
forecast_h <- as.h2o(fc_df_st) # future values of series inputs
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
## Label the names of the independent and dependent variables for convenience
x <- c("month","lag12","trend","trend_sqr") # predictor column names
y <- "y" # response column name

Now that the data has been loaded into the working cluster, we can begin the training process.

Forecasting with the Random Forest model

Build a forecasting model with the Random Forest (RF) algorithm.

## Random Forest model with default settings
ch_rf1 <- h2o.randomForest(training_frame = train_h, # training set
                          nfolds = 5, # number of folds for CV training
                          x=x, # character vector for independent variable names
                          y=y, # string for name of dependent variable
                          ntrees = 500, # maximum number of trees
                          # scoring rounds with no improvement
                          # allowed before training stops early
                          stopping_rounds = 10,
                          # metric used to decide when to stop adding trees
                          stopping_metric = "RMSE", # error metric
                          # score during each iteration of model training
                          score_each_iteration = TRUE,
                          # minimal improvement to continue training process
                          stopping_tolerance = 0.0001,
                          seed = 1234) # fixed seed for reproducibility
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |============================================================          |  85%
  |                                                                            
  |======================================================================| 100%
## View contribution of model inputs
# early stopping kicked in: only 28 of the allowed 500 trees were built
ch_rf1@model$model_summary # review parameters of model performance
## Model Summary: 
##   number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1              28                       28               26501        11
##   max_depth mean_depth min_leaves max_leaves mean_leaves
## 1        15   12.96429         58         84    70.60714
h2o.varimp_plot(ch_rf1) # each variable ranked on a scale from 0 to 1

## Plot the learning process of the model as a function of the number of trees
tree_score <- ch_rf1@model$scoring_history$training_rmse
# FIX: plotly's valid scatter mode for a line trace is "lines", not "line"
plot_ly(x = seq_along(tree_score), y = tree_score,
        type = "scatter", mode = "lines") %>%
  layout(title = "The Trained Model Score History",
         yaxis = list(title = "RMSE"),
         xaxis = list(title = "Number of Trees"))
## Warning: Ignoring 1 observations
## Measure the model's performance on the testing partition
test_h$pred_rf <- h2o.predict(ch_rf1,test_h) # predictions for the 12 test months
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
test_1 <- as.data.frame(test_h) # transfer back to an R data frame
# calculate MAPE score of RF model on test partition (~0.066, i.e. about 6.6%)
mape_rf1 <- mean(abs(test_1$y-test_1$pred_rf)/test_1$y)
mape_rf1 # output MAPE score
## [1] 0.0661669
## Visualize results and compare prediction to actual and baseline predictions
plot_ly(data = test_1)%>% # use test set data frame as input
  add_lines(x=~date,y=~y,name="Actual")%>%
  add_lines(x=~date,y=~yhat,name="Linear Regression",line=list(dash="dot"))%>%
  add_lines(x=~date,y=~pred_rf,name="Random Forest",
            line=list(dash="dash"))%>%
  layout(title=
           "Average Chicken Price - Actual vs. Prediction (Random Forest)",
         yaxis=list(title="Cost per Pound"),
         xaxis=list(title="Month"))

The Random Forest model with its default settings has a MAPE rate of approximately 6.62% on the testing partition.

Forecasting with the GBM model

## Train the GBM model with the same input data used previously
ch_gb1 <- h2o.gbm(training_frame = train_h, # training set
                  nfolds = 5, # number of folds for CV training
                  x=x, # character vector for independent variable names
                  y=y, # string for name of dependent variable
                  max_depth = 20, # set the maximum tree depth
                  distribution = "gaussian",
                  ntrees = 500, # number of trees
                  learn_rate = 0.1, # default value between 0 and 1
                  # score the model during each iteration of training
                  score_each_iteration = TRUE,
                  # FIX: fix the seed so results are reproducible, matching the
                  # Random Forest model trained above
                  seed = 1234)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=                                                                     |   1%
  |                                                                            
  |========================                                              |  35%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |=========================================                             |  58%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |===================================================                   |  73%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |========================================================              |  80%
  |                                                                            
  |==========================================================            |  83%
  |                                                                            
  |==============================================================        |  89%
  |                                                                            
  |=================================================================     |  93%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |======================================================================| 100%
## Review the rank of the importance of the model's variables
# FIX: this originally printed ch_rf1's summary (copy-paste from the Random
# Forest section); the GBM model is the one being reviewed here, so the
# printed output below (which repeats the RF numbers verbatim) is stale
ch_gb1@model$model_summary # review parameters of model performance
## Model Summary: 
##   number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1              28                       28               26501        11
##   max_depth mean_depth min_leaves max_leaves mean_leaves
## 1        15   12.96429         58         84    70.60714
h2o.varimp_plot(ch_gb1) # each GBM input variable ranked on a scale from 0 to 1

## Plot the learning process of the model as a function of the number of trees
tree_score <- ch_gb1@model$scoring_history$training_rmse
# FIX: plotly's valid scatter mode for a line trace is "lines", not "line"
plot_ly(x = seq_along(tree_score), y = tree_score,
        type = "scatter", mode = "lines") %>%
  layout(title = "The Trained Model Score History",
         yaxis = list(title = "RMSE"),
         xaxis = list(title = "Number of Trees"))
## Test the model's performance on the testing set
test_h$pred_gbm <- h2o.predict(ch_gb1,test_h) # predictions for the 12 test months
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
test_1 <- as.data.frame(test_h) # transfer back to an R data frame
# calculate MAPE score of GBM model on test partition (~0.087, i.e. about 8.7%)
mape_gb1 <- mean(abs(test_1$y-test_1$pred_gbm)/test_1$y)
mape_gb1 # output MAPE score
## [1] 0.08661554
## Visualize results and compare prediction to actual and baseline predictions
plot_ly(data = test_1)%>% # use test set data frame as input
  add_lines(x=~date,y=~y,name="Actual")%>%
  add_lines(x=~date,y=~yhat,name="Linear Regression",line=list(dash="dot"))%>%
  add_lines(x=~date,y=~pred_gbm,name="Gradient Boosting Machine",
            line=list(dash="dash"))%>%
  layout(title=
        "Average Chicken Price - Actual vs. Prediction (Gradient Boosting Machine)",
         yaxis=list(title="Cost per Pound"),
         xaxis=list(title="Month"))

The Gradient Boosting Machine model with its default settings has a MAPE rate of approximately 8.66%, which is higher than both the Random Forest model's 6.62% and the 7.16% error rate of the benchmark linear regression model.

Prediction for March 2023

## Linear regression models
data.frame(forecast(ch_lr1,h=24))[24,] # 1.676747 benchmark
##          Point.Forecast    Lo.80    Hi.80    Lo.95    Hi.95
## Mar 2023       1.676747 1.590177 1.763317 1.544198 1.809295
## Exponential smoothing models
data.frame(forecast(ch_hw1,h=24))[24,] # 1.606867
##          Point.Forecast    Lo.80    Hi.80    Lo.95    Hi.95
## Mar 2023       1.606867 1.449979 1.763755 1.366927 1.846807
data.frame(forecast(ch_hw2,h=24))[24,] # 1.581002
##          Point.Forecast    Lo.80    Hi.80   Lo.95    Hi.95
## Mar 2023       1.581002 1.480253 1.681751 1.42692 1.735084
## ARIMA models
# NOTE(review): the output below is labeled "Mar 2024" (not Mar 2023) and its
# point forecast (0.54) is far below the others — confirm arima_m1 was fit on
# the same, untransformed series as the other models
data.frame(forecast(arima_m1,h=24))[24,] # 0.5446472
##          Point.Forecast     Lo.80     Hi.80     Lo.95     Hi.95
## Mar 2024      0.5446472 0.3936087 0.6956856 0.3136538 0.7756405
data.frame(forecast(arima_a1,h=24))[24,] # 1.613422
##          Point.Forecast    Lo.80    Hi.80    Lo.95    Hi.95
## Mar 2023       1.613422 1.514219 1.712625 1.461704 1.765141
data.frame(forecast(arima_a2,h=24))[24,] # 1.618961
##          Point.Forecast   Lo.80    Hi.80    Lo.95    Hi.95
## Mar 2023       1.618961 1.51631 1.721611 1.461971 1.775951